[507a54]: / development / paraphrase / randomize entities.py

Download this file

67 lines (43 with data), 1.9 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
from ruamel import yaml
import numpy as np
import sys
def load_entities(entity_names, base_dir='.'):
    """Load the lookup values for several entity types.

    Every entity ``name`` is read from ``<base_dir>/<name>.yml`` by
    ``load_entities_list``.

    Args:
        entity_names: Iterable of entity type names; each name is also the
            stem of the YAML file that holds its values.
        base_dir: Directory containing the lookup files. The default ``'.'``
            produces the same ``./<name>.yml`` paths (relative to the current
            working directory) that were previously hard-coded.

    Returns:
        A dict mapping each entity name to the list returned by
        ``load_entities_list`` for its file.
    """
    return {
        entity: load_entities_list(f'{base_dir}/{entity}.yml')
        for entity in entity_names
    }
def load_entities_list(file_adr):
    """Load the entity values stored in one lookup YAML file.

    The file must contain a top-level ``nlu`` list whose first item has an
    ``examples`` string holding one value per line, normally written as a
    ``- value`` bullet.

    Args:
        file_adr: Path of the YAML file to read.

    Returns:
        A list with one string per non-blank example line, with the leading
        ``- `` bullet removed when present.

    Raises:
        KeyError, IndexError: If the file lacks the ``nlu[0].examples``
            structure described above.
    """
    # YAML streams are Unicode; read as UTF-8 instead of the locale default.
    # NOTE(review): assumes the lookup files are UTF-8 (not UTF-16) - confirm.
    with open(file_adr, 'r', encoding='utf-8') as f:
        # NOTE(review): the function-style yaml.load API is deprecated in newer
        # ruamel.yaml releases - confirm the pinned version or migrate to
        # yaml.YAML().
        document = yaml.load(f, Loader=yaml.RoundTripLoader)

    examples_block = document['nlu'][0]['examples']

    entities = []
    for row in examples_block.split('\n'):
        # Drop the list bullet only when it is really there, so values written
        # without it are not truncated by two characters.
        value = row[2:] if row.startswith('- ') else row
        # A block scalar ends with a newline, so splitting yields a trailing
        # empty row; skipping blanks keeps '' out of the candidate values.
        if value.strip():
            entities.append(value)
    return entities
def _make_entity_replacer(entity, candidates):
    """Return a ``re.sub`` callback emitting a random ``[value](entity)``."""
    def _replace(_match):
        # Drawn per match, so every annotation gets its own random value.
        value = np.random.choice(candidates).strip()
        return f'[{value}]({entity})'
    return _replace


def randomize_entity_names(nlu_dict, entities_dict):
    """Replace annotated entity values in NLU examples with random ones.

    Every ``[text](entity)`` annotation whose ``entity`` is a key of
    ``entities_dict`` is rewritten to ``[random value](entity)``. All
    annotations on a line are handled, each with an independent draw from
    ``np.random.choice``.

    Args:
        nlu_dict: Mapping with an ``'nlu'`` list; each item must have an
            ``'examples'`` string with one example per line. It is modified
            in place.
        entities_dict: Mapping of entity name to a non-empty sequence of
            candidate values (surrounding whitespace is stripped).

    Returns:
        The same ``nlu_dict`` object, with its ``'examples'`` strings updated.
    """
    # Build each pattern/callback pair once rather than per example line.
    # The entity name is escaped so it is always matched literally.
    substitutions = [
        (
            re.compile(rf'\[[^\]]*\]\({re.escape(entity)}\)'),
            _make_entity_replacer(entity, candidates),
        )
        for entity, candidates in entities_dict.items()
    ]

    for intent in nlu_dict['nlu']:
        randomized_lines = []
        for example in intent['examples'].split('\n'):
            for pattern, replacer in substitutions:
                example = pattern.sub(replacer, example)
            randomized_lines.append(example)
        # Joining with '\n' keeps the original line structure, including a
        # single trailing newline when the input had one.
        intent['examples'] = '\n'.join(randomized_lines)
    return nlu_dict
def main():
    """Randomize the entity values of the cleaned NLU file and save the result.

    Reads ``./nlu_cleaned.yml``, loads the lookup values for each name in
    ``ENTITY_NAMES``, replaces the annotated entity values with random ones
    and writes the outcome to ``nlu_random.yml``.
    """
    NLU_FILE = './nlu_cleaned.yml'
    # Lookup files (<name>.yml) are read from the current working directory.
    ENTITY_NAMES = ['drug', 'lab']
    OUTPUT_FILE = 'nlu_random.yml'

    # Load files. YAML streams are Unicode, so read and write as UTF-8 instead
    # of relying on the platform's default encoding.
    # NOTE(review): the function-style yaml.load / yaml.dump API is deprecated
    # in newer ruamel.yaml releases - confirm the pinned version or migrate to
    # yaml.YAML().
    with open(NLU_FILE, 'r', encoding='utf-8') as f:
        nlu = yaml.load(f, Loader=yaml.RoundTripLoader)
    entities = load_entities(ENTITY_NAMES)

    randomized_nlu = randomize_entity_names(nlu, entities)

    # Save
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(randomized_nlu, f, Dumper=yaml.RoundTripDumper, default_flow_style=None)


if __name__ == '__main__':
    main()